############################################################
# DigitAnalysis R Package
# Code for Paper: "Detecting Corruption: Evidence from a World Bank Project in Kenya"
# Authors names blinded during review process
# Working Version: November 30, 2023
############################################################

#### Read R package
library(devtools)
install_github("jlederluis/digitanalysis", force = TRUE)

library(digitanalysis)
library(ggplot2)

### Load Data
fp = "wbdata.csv"

# Pre-clean data: group training, travel, vehicles
D <- read.csv(fp, stringsAsFactors = FALSE)
D$SectorGroup <- D$SECTOR
unique(D$SECTOR)
D$SectorGroup[D$SectorGroup == "TRN" | D$SectorGroup == "TRAVEL" | D$SectorGroup == "VEHICLES"] <- "TRN_TRV_VEH"

# Fix single stray line
D <- D[D$YEAR != "Turkana",] 

# Process data 
Data = process_digit_data(raw_df = D, digit_columns = c('ALEXP.Values', "BENTOT.Values", "BENM", "BENF"))

# Make collapsed data for sector plot
D$SectorGroup[D$SectorGroup == "GE" | D$SectorGroup == "CW"] <- "GE_CW"
DataSector = process_digit_data(raw_df = D, digit_columns = c('ALEXP.Values', "BENTOT.Values", "BENM", "BENF"))


#######################################
# Plot toggle: turn on before plotting
plot_toggle = TRUE 

##########################################################################################
#All digits test except first with expenditure
ADT_ALEXP = all_digits_test(digitdata = Data, data_columns = 'ALEXP.Values', skip_first_digit = TRUE, omit_05 = c(0,5), break_out='DIST',
                            suppress_first_division_plots=TRUE, plot=plot_toggle)
ADT_ALEXP_ALL = all_digits_test(digitdata = Data, data_columns = 'ALEXP.Values', skip_first_digit = TRUE, omit_05 = c(0,5),
                            suppress_first_division_plots=TRUE, plot=plot_toggle)
ADT_ALEXP_ALL$sample_sizes

#save figures
t<-ADT_ALEXP$plots$AllBreakout$AllCategory$aggregate_barplot
pdf("Figures/ADP_expenditure.pdf")
t+ ggtitle("") + theme(legend.position = "none") + xlab("Expenditure Digits from All Digit Places  \n (Excluding 0s and 5s)") +  scale_y_continuous(labels = scales::percent)
dev.off()


########################################################################
#All digits test except first with participants
ADT_BEN = all_digits_test(digitdata = Data, data_columns = c("BENM", "BENF"), skip_first_digit = TRUE, omit_05 = c(0,5), break_out='DIST',
                          suppress_first_division_plots=TRUE, plot=plot_toggle)
ADT_BEN$sample_sizes

t<-ADT_BEN$plots$AllBreakout$AllCategory$aggregate_barplot

pdf("Figures/ADP_beneficiary.pdf")
t+ ggtitle("") + theme(legend.position = "none") + xlab("Participant Digits from All Digit Places Beyond First  \n (Excluding 0s and 5s)") +  scale_y_continuous(labels = scales::percent)
dev.off()

########################################################################
#First digit test with expenditure
first_digit = all_digits_test(digitdata = Data, data_columns = 'ALEXP.Values', digit_places = 1, omit_05 = 0, break_out='DIST',
                              suppress_first_division_plots=TRUE, plot=plot_toggle)

all<- first_digit$plots$AllBreakout$AllCategory$aggregate_barplot + ggtitle("") + theme(legend.position = "none") +  scale_y_continuous(labels = scales::percent) + coord_cartesian(ylim =c(0, .3))
ijara <- first_digit$plots$Ijara$AllCategory$aggregate_barplot+ ggtitle("") + theme(legend.position = "none") +  scale_y_continuous(labels = scales::percent)+ coord_cartesian(ylim =c(0, .3))


pdf("Figures/FirstDigits.pdf")
library(gridExtra)
grid.arrange(all, ijara, ncol = 1)
dev.off()

pdf("Figures/FirstDigitsAll.pdf")
all
dev.off()

pdf("Figures/FirstDigitsIjara.pdf")
ijara
dev.off()

########################################################################
#Padding test with expenditure

#To simulate p-vals
padding = padding_test(digitdata = Data, data_columns = 'ALEXP.Values', break_out = "DIST", max_length=7, num_digits=5, N=100000, omit_05=c(0,5), simulate=T, suppress_first_division_plots=TRUE, plot=TRUE)


# To produce plot
padding = padding_test(digitdata = Data, data_columns = 'ALEXP.Values', break_out = "DIST", max_length=7, num_digits=5, N=1, omit_05=c(0,5),
                       simulate=F, suppress_first_division_plots=TRUE, plot=plot_toggle)

# By category 

t<- padding$AllBreakout$plot

pdf("Figures/PaddingTest.pdf")
t+ ggtitle("") + theme(legend.position = "none")
dev.off()

########################################################################
#Digit pair test with participants
digit_pair = digit_pairs_test(digitdata = Data, data_columns = 'BENTOT.Values', omit_05 = 0, break_out='DIST', plot=plot_toggle)


pdf("Figures/DigitPairs.pdf")
digit_pair$plot+  ggtitle("") + xlab("District") + scale_y_continuous(labels = scales::percent)
dev.off()


########################################################################
#Rounding test with expenditure
rounding = rounding_test(digitdata = Data, data_columns = 'ALEXP.Values', break_out='DIST',
                         rounding_patterns = c('0','00','000','0000', '00000', '000000', '5', '50', '500'), plot="Save")

rounding$p_values
pdf("Figures/Rounding.pdf")
rounding$plot +ggtitle("")+ xlab("District") + scale_y_continuous(labels = scales::percent, breaks = seq(0, .50, by = .10))
dev.off()

########################################################################
#Repeat test with expenditure
repeats = repeat_test(digitdata = Data, duplicate_matching_cols=c("ALEXP.Values", "YEAR", "DIST", "SECTOR"),
                      break_out='DIST', data_column = 'ALEXP.Values', plot=plot_toggle)

pdf("Figures/Repeats.pdf")
repeats$plot +ggtitle("") + xlab("District")+ scale_y_continuous(labels = scales::percent)
dev.off()


########################################################################
#Sector test with expenditure
sector = sector_test(digitdata = DataSector, category='SectorGroup', duplicate_matching_cols=c("ALEXP.Values", "YEAR", "DIST", "SECTOR"),
                     break_out='DIST', data_column = 'ALEXP.Values',
                     category_instance_analyzing = 'TRN_TRV_VEH', plot=plot_toggle, remove_all_category_visualize = T)


pdf("Figures/Sector.pdf")
sector$plot +ggtitle("") + xlab("District") + scale_fill_manual(values=c("#888888", "#434343", "#000000"), labels=c("Civil Works, Goods & Equipment", "Training, Travel, Vehicles")) +   scale_y_continuous(labels = scales::percent, breaks = seq(0, .50, by = .10))
dev.off()

########################################################################
#Unpack rounded numbers test with participants
unpack = unpack_round_numbers_test(digitdata = Data, rounding_split_column="BENTOT.Values", analysis_columns=c("BENM", "BENF"),
                                   skip_first_digit=TRUE, omit_05=c(0,5), suppress_first_division_plots=TRUE, plot=plot_toggle)


t<- unpack$plots$unround$AllBreakout$AllCategory$aggregate_barplot+ggtitle("Unrounded") + theme(plot.title = element_text(hjust = 0.5))  + coord_cartesian(ylim = c(0, .2)) + scale_y_continuous(labels = scales::percent, breaks = seq(0, .2, by = .05)) + theme(legend.position = "none") 
s<- unpack$plots$round$AllBreakout$AllCategory$aggregate_barplot+ggtitle("Rounded") + theme(plot.title = element_text(hjust = 0.5))  + coord_cartesian(ylim = c(0, .2)) + scale_y_continuous(labels = scales::percent, breaks = seq(0, .2, by = .05)) + theme(legend.position = "none")

pdf("Figures/UnpackRound.pdf")
library(gridExtra)
grid.arrange(s, t, ncol = 2)
dev.off()

# For p-values
unpack = unpack_round_numbers_test(digitdata = Data, rounding_split_column="BENTOT.Values", break_out = "DIST", analysis_columns=c("BENM", "BENF"),
                                   skip_first_digit=TRUE, omit_05=c(0,5), suppress_first_division_plots=TRUE, plot=plot_toggle)

########################################################################
# Last digit test for appendix
last_digit_exp = all_digits_test(digitdata = Data, digit_places = -1, break_out = "DIST", data_columns = 'ALEXP.Values', omit_05 = c(0,5),
                              suppress_first_division_plots=TRUE, plot=TRUE)

pdf("Figures/LastDigitsExpenditure")
last_digit_exp$plots$AllBreakout$AllCategory$aggregate_barplot+ggtitle("")+theme(legend.position = "none")
dev.off()

last_digit_ben = all_digits_test(digitdata = Data, digit_places = -1, data_columns = c("BENM", "BENF"), omit_05 = c(0,5),
                              suppress_first_division_plots=TRUE, plot=plot_toggle)

pdf("Figures/LastDigitsBen")
last_digit_ben$plots$AllBreakout$AllCategory$aggregate_barplot+ggtitle("")+theme(legend.position = "none")
dev.off()

########################################################################

## Padding test by year with simulation
 # padding_year = padding_test(digitdata = Data, data_columns = 'ALEXP.Values', break_out = "DIST", category = "YEAR", max_length=7, num_digits=5, N=10000, omit_05=c(0,5),
#                       simulate=T, suppress_first_division_plots=TRUE, plot=TRUE)

## Padding test by year clean plot 
padding_year = padding_test(digitdata = Data, data_columns = 'ALEXP.Values', break_out = "DIST", category = "YEAR", max_length=7, num_digits=5, N=100, omit_05=c(0,5),
                       simulate=F, suppress_first_division_plots=TRUE, plot=TRUE)

p<- padding_year$plots$All

p + scale_fill_manual(values=c("#FFFFFF", "#D7D7D7", "#000000", "#B4B4B4", "#969696"), labels=c("", "2003-2006", "2007", "2008", "2009"))

pdf("PaddingYear.pdf")
padding_year$plot$All
dev.off()


pdf("PaddingYearNew.pdf")
p + scale_fill_manual(values=c("#FFFFFF", "#D7D7D7", "#000000", "#B4B4B4", "#969696"), labels=c("", "2003-2006", "2007", "2008", "2009"))
dev.off()



sink(file = "padding_year.txt")
print("Deviation from Mean Values")
padding_year$diff_in_mean
print("P-Values")
padding_year$p_values
sink()




########################################################################

# P-Value printout to produce final table

sink(file = "P-Values.txt")
print("ADP Expenditure")
ADT_ALEXP$p_values
print("ADP Beneficiaries")
ADT_BEN$p_values

print("First Digits")
first_digit$p_values

print("Digit Pairs")
digit_pair$p_values

print("Sector Repeats")
sector$p_values

print("Repeats")
repeats$p_values

print("Rounding")
rounding$p_values

print("Unpack Round")
unpack$p_values$round

sink()

sink(file = "Padding_AllYear.txt")
print(padding$p_values)
sink()



